import numpy as np 
import httpx
from selectolax.parser import HTMLParser
from dataclasses import dataclass, asdict
import re
import csv
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
%matplotlib inline
# Global plot styling for the notebook. The three calls run in this order and
# each one updates global matplotlib/seaborn settings.
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")
# NOTE(review): set_theme() is called last with default arguments; per the
# seaborn docs its default style is "darkgrid", so it presumably overrides the
# two style choices above - confirm which look is intended.
sns.set_theme()


@dataclass
class Player:
    """One row of the scraped player table.

    The field order matches the column order used when the records are
    written to ``results.csv``.
    """

    name: str
    position: str
    age: int
    club: str
    matches: int
    goals: int
    assists: int
    subOn: int  # presumably: times substituted on
    subOff: int  # presumably: times substituted off
    value: float  # number extracted from the row's value cell
        
        
def parse_players(html):
    """Extract one record per player row from a parsed listing page.

    Args:
        html: Parsed page exposing ``css`` (the object returned by
            ``get_html``).

    Returns:
        A list of dicts produced by ``asdict(Player(...))``. All values are
        the raw strings read from the page; no numeric conversion is done
        here.

    Raises:
        IndexError: If a row has fewer cells/links than expected, or its
            value cell contains no ``digits.digits`` number.
        AttributeError: If a row has no ``td.hauptlink`` or ``td.rechts``
            cell (``css_first`` returned ``None``).
    """
    results = []
    # A tuple (not a set) keeps the row-class order fixed between runs. Set
    # iteration order for strings varies with hash randomisation, which made
    # the order of the scraped rows - and any later positional row index -
    # unstable.
    for row_class in ("odd", "even"):
        for player in html.css("tr." + row_class):
            # Query the centred cells once per row instead of once per field.
            cells = player.css("td.zentriert")
            value_text = player.css_first("td.rechts").text()
            new_data = Player(
                name=player.css_first("td.hauptlink").text(),
                position=player.css("tr")[2].text(),
                age=cells[1].text(),
                club=player.css("a")[2].attributes.get('title', ''),
                matches=cells[4].text(),
                goals=cells[5].text(),
                assists=cells[7].text(),
                subOn=cells[11].text(),
                subOff=cells[12].text(),
                # Raw string: "\d" in a normal literal is an invalid escape.
                value=re.findall(r"\d+\.\d+", value_text)[0],
            )
            results.append(asdict(new_data))
    return results
  
def to_csv(results):
    """Append player records to ``results.csv`` in the working directory.

    Args:
        results: Iterable of dicts keyed by the player column names
            (as returned by ``parse_players``).

    The file is opened in append mode and no header row is written, so
    calling this repeatedly (including across re-runs) keeps adding rows.
    """
    fieldnames = ["name", "position", "age", "club", "matches",
                  "goals", "assists", "subOn", "subOff", "value"]
    # newline="" is required by the csv module: without it the writer's own
    # "\r\n" is translated again on Windows, producing "\r\r\n" row endings.
    with open("results.csv", "a", encoding='utf-8', newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writerows(results)
        
def get_html(page):
    """Download and parse one page of the player listing.

    Args:
        page: Page number interpolated into the URL's ``page`` query
            parameter.

    Returns:
        An ``HTMLParser`` built from the response body text.

    Raises:
        httpx.HTTPStatusError: If the response status is not a success (2xx).
    """
    url = f"https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop?land_id=0&ausrichtung=alle&spielerposition_id=alle&altersklasse=alle&jahrgang=0&kontinent_id=0&plus=1&page={page}"
    resp = httpx.get(url)
    # Fail loudly on an error response instead of parsing an error page,
    # which would otherwise yield zero rows without any warning.
    resp.raise_for_status()
    return HTMLParser(resp.text)


@dataclass
class Team:
    """A club and the country scraped alongside it.

    The field order matches the column order used for ``clubs.csv``.
    """

    club: str
    country: str
   
        
def parse_teams(html):
    """Extract one record per club row from a parsed ranking page.

    Args:
        html: Parsed page exposing ``css`` (the object returned by
            ``get_club_html``).

    Returns:
        A list of ``{"club": ..., "country": ...}`` dicts. The club is the
        ``title`` of the row's first link and the country is the ``title``
        of the row's second image; a missing ``title`` becomes ``''``.

    Raises:
        IndexError: If a row has no link or fewer than two images.
    """
    results = []
    # A tuple (not a set) keeps the row-class order fixed between runs; set
    # iteration order for strings varies with hash randomisation.
    for row_class in ("odd", "even"):
        for team in html.css("tr." + row_class):
            new_data = Team(
                club=team.css("a")[0].attributes.get('title', ''),
                country=team.css("img")[1].attributes.get('title', ''),
            )
            results.append(asdict(new_data))
    return results
  
def to_clubcsv(results):
    """Append club records to ``clubs.csv`` in the working directory.

    Args:
        results: Iterable of dicts with ``club`` and ``country`` keys
            (as returned by ``parse_teams``).

    The file is opened in append mode and no header row is written, so
    calling this repeatedly (including across re-runs) keeps adding rows.
    """
    # newline="" is required by the csv module: without it the writer's own
    # "\r\n" is translated again on Windows, producing "\r\r\n" row endings.
    with open("clubs.csv", "a", encoding='utf-8', newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["club", "country"])
        writer.writerows(results)
        
def get_club_html(page):
    """Download and parse one page of the club ranking.

    Args:
        page: Page number interpolated into the URL path.

    Returns:
        An ``HTMLParser`` built from the response body text.

    Raises:
        httpx.HTTPStatusError: If the response status is not a success (2xx).
    """
    url = f"https://www.transfermarkt.us/uefa/klubrangliste/statistik/stat/page/{page}/"
    resp = httpx.get(url)
    # Fail loudly on an error response instead of parsing an error page,
    # which would otherwise yield zero rows without any warning.
    resp.raise_for_status()
    return HTMLParser(resp.text)


# Scrape the listings page by page, appending each page's rows to its CSV file.
# Player listing: pages 1 to 9.
for page in range(1, 10):
    html = get_html(page)
    res = parse_players(html)
    to_csv(res)

# Club ranking: pages 1 to 4.
for page in range(1, 5):
    html = get_club_html(page)
    res = parse_teams(html)
    to_clubcsv(res)


# Load the scraped CSV files. Neither file is written with a header row, so
# the column names are supplied explicitly.
player_columns = ["name", "position", "age", "club", "matches",
                  "goals", "assists", "subOn", "subOff", "value"]
df = pd.read_csv('results.csv', sep=',', names=player_columns)
clubs = pd.read_csv("clubs.csv", sep=',', names=["club", "country"])

# Attach each player's club country. A left join keeps players whose club is
# absent from the club file (their country is left as NaN).
data = df.merge(clubs, how='left', on='club')

# Highest value first, renumbered from 0.
data = data.sort_values("value", ascending=False).reset_index(drop=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      225 non-null    object 
 1   position  225 non-null    object 
 2   age       225 non-null    int64  
 3   club      225 non-null    object 
 4   matches   225 non-null    int64  
 5   goals     225 non-null    int64  
 6   assists   225 non-null    int64  
 7   subOn     225 non-null    int64  
 8   subOff    225 non-null    int64  
 9   value     225 non-null    float64
 10  country   200 non-null    object 
dtypes: float64(1), int64(6), object(4)
memory usage: 19.5+ KB


data[data.country.isna()]


# Fill in the missing countries by club name rather than by row position: a
# player's row index depends on the scraping and sort order, so it is not
# stable between runs.
spanish_clubs = ['Athletic Bilbao', 'Celta de Vigo']
no_country = data['country'].isna()
data.loc[no_country & data['club'].isin(spanish_clubs), 'country'] = 'Spain'
# Every other club without a country is an English club. Only the country
# column is filled, so a gap in any other column can never become 'England'.
data['country'] = data['country'].fillna('England')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      225 non-null    object 
 1   position  225 non-null    object 
 2   age       225 non-null    int64  
 3   club      225 non-null    object 
 4   matches   225 non-null    int64  
 5   goals     225 non-null    int64  
 6   assists   225 non-null    int64  
 7   subOn     225 non-null    int64  
 8   subOff    225 non-null    int64  
 9   value     225 non-null    float64
 10  country   225 non-null    object 
dtypes: float64(1), int64(6), object(4)
memory usage: 19.5+ KB


# Distinct playing positions present in the data:
set(data['position'].unique())

{'Attacking Midfield',
 'Central Midfield',
 'Centre-Back',
 'Centre-Forward',
 'Defensive Midfield',
 'Goalkeeper',
 'Left Winger',
 'Left-Back',
 'Right Winger',
 'Right-Back',
 'Second Striker'}


# Keep only the attacking positions.
attacking_positions = [
    'Attacking Midfield',
    'Centre-Forward',
    'Left Winger',
    'Right Winger',
    'Second Striker',
]
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
data = data[data['position'].isin(attacking_positions)].copy()


# Goal contribution ratio: a measure of a player's contribution to their
# team's goals, computed as (goals + assists) divided by matches played.
# NOTE: a player with 0 matches would produce inf/NaN here.
data['goalContributionRatio'] = (data['goals'] + data['assists']) / data['matches']


data.head(5)


# Categorical overview: player count per country (left panel) and the share
# of players per club (right panel).
country_order = data['country'].value_counts().index
club_counts = data['club'].value_counts()

plt.figure(figsize=(20, 10))

plt.subplot(1, 2, 1)
plt1 = sns.countplot(data=data, y='country', order=country_order)
plt.title('Number of Players playing at each league', fontsize=24)

plt.subplot(1, 2, 2)
plt2 = plt.pie(club_counts, labels=club_counts.index, autopct='%.0f%%')
plt.title('Clubs of top valued players', fontsize=24)

plt.tight_layout()
plt.show()


# Box plots of player value, grouped by position (left) and by country (right).
fig, (ax_position, ax_country) = plt.subplots(1, 2, figsize=(20, 10))

plt1 = sns.boxplot(x='position', y='value', data=data, ax=ax_position)
ax_position.set_title('Distribution of players value by position', fontsize=24)

plt2 = sns.boxplot(x='country', y='value', data=data, ax=ax_country)
ax_country.set_title('Distribution of players value by league', fontsize=24)

plt.tight_layout()
plt.show()


# Position and club are not used from here on.
data.drop(['position', 'club'], axis=1, inplace=True)


# Visualize correlation matrix
df = data.copy()
labelencoder = LabelEncoder()
# NOTE(review): the encoder replaces each country with an integer code; the
# codes carry no real ordering, so treat correlations with 'country' (and its
# use as a linear-regression feature) with caution.
df['country'] = labelencoder.fit_transform(df['country'])
fig, ax = plt.subplots(figsize=(20, 20))
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps the text 'name' column out; pandas >= 2.0 raises on it instead of
# silently dropping it.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, linewidths=.1, cmap="RdBu");


sns.pairplot(df);


# Target: player value. Features: every remaining column except the player's
# name and the target itself. (An earlier four-column selection - matches,
# age, country, goalContributionRatio - was overwritten before use and has
# been removed.)
y = df.value
X = df.drop(['name', 'value'], axis=1)


# Fixed seed so the split, and every metric computed from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline = Pipeline([('std_scalar', StandardScaler())])
# The scaler is fitted on the training split only and then reused on the
# test split.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression()


from sklearn import metrics
from sklearn.model_selection import cross_val_score

def cross_val(model):
    """Return the mean of ``model``'s 10-fold cross-validation scores.

    The scores come from ``cross_val_score`` run on the module-level ``X``
    and ``y``.
    """
    fold_scores = cross_val_score(model, X, y, cv=10)
    return fold_scores.mean()

def print_evaluate(true, predicted):
    """Print MAE, MSE, RMSE and R2 for the predictions, then a separator line.

    Args:
        true: Observed target values.
        predicted: Predicted target values.
    """
    mse = metrics.mean_squared_error(true, predicted)
    report = (
        ('MAE:', metrics.mean_absolute_error(true, predicted)),
        ('MSE:', mse),
        ('RMSE:', np.sqrt(mse)),
        ('R2 Square', metrics.r2_score(true, predicted)),
    )
    for label, score in report:
        print(label, score)
    print('--------------------------------')
    
def evaluate(true, predicted):
    """Return ``(mae, mse, rmse, r2)`` for the predictions.

    Args:
        true: Observed target values.
        predicted: Predicted target values.
    """
    mse = metrics.mean_squared_error(true, predicted)
    return (
        metrics.mean_absolute_error(true, predicted),
        mse,
        np.sqrt(mse),
        metrics.r2_score(true, predicted),
    )
# Predict on both splits, then report the metrics for the held-out test split.
train_pred = reg.predict(X_train)
test_pred = reg.predict(X_test)
print('Model Evaluation:\n--------------------------------')
print_evaluate(y_test, test_pred)

Model Evaluation:
--------------------------------
MAE: 18.869821333901264
MSE: 504.7084958543172
RMSE: 22.465718235888147
R2 Square 0.4997574197805914
--------------------------------


# Actual test values against predictions, with a fitted regression line.
sns.regplot(
    x=y_test,
    y=test_pred,
    line_kws={'lw': 1, 'color': '#FF4500', 'linestyle': '-.'},
    marker="o",
);


# x/y are passed by keyword: seaborn deprecated passing them positionally
# (FutureWarning) and treats it as an error from version 0.12.
sns.residplot(x=y_test, y=test_pred);

C:\Users\youss\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


# Distribution of the test-set errors (actual minus predicted), with a KDE curve.
residuals = y_test - test_pred
sns.displot(residuals, kde=True);

	name	position	age	club	matches	goals	assists	subOn	subOff	value	country
63	Bruno Guimarães	Defensive Midfield	25	Newcastle United	35	4	5	3	11	60.0	NaN
73	Moisés Caicedo	Defensive Midfield	21	Brighton & Hove Albion	40	1	1	2	5	55.0	NaN
84	Ivan Toney	Centre-Forward	27	Brentford FC	35	21	5	2	4	50.0	NaN
87	Alexander Isak	Centre-Forward	23	Newcastle United	29	11	3	9	17	50.0	NaN
113	Sven Botman	Centre-Back	23	Newcastle United	39	0	0	3	2	45.0	NaN
116	Amadou Onana	Defensive Midfield	21	Everton FC	34	1	1	3	13	42.0	NaN
117	Alexis Mac Allister	Central Midfield	24	Brighton & Hove Albion	39	11	3	4	9	42.0	NaN
142	Anthony Gordon	Left Winger	22	Newcastle United	31	3	1	14	11	40.0	NaN
147	James Ward-Prowse	Central Midfield	28	Southampton FC	42	9	3	1	1	38.0	NaN
150	Joelinton	Attacking Midfield	26	Newcastle United	37	8	3	3	3	38.0	NaN
154	Miguel Almirón	Right Winger	29	Newcastle United	39	11	3	6	26	35.0	NaN
156	João Palhinha	Defensive Midfield	27	Fulham FC	41	4	1	4	8	35.0	NaN
165	Allan Saint-Maximin	Left Winger	26	Newcastle United	26	1	5	13	10	35.0	NaN
181	Marc Guéhi	Centre-Back	22	Crystal Palace	38	1	0	1	1	35.0	NaN
188	Douglas Luiz	Central Midfield	24	Aston Villa	36	5	5	4	8	35.0	NaN
195	Joachim Andersen	Centre-Back	26	Crystal Palace	35	1	0	0	3	32.0	NaN
197	Ollie Watkins	Centre-Forward	27	Aston Villa	36	15	6	2	9	32.0	NaN
201	Jacob Ramsey	Central Midfield	21	Aston Villa	38	5	6	8	18	32.0	NaN
203	Robert Sánchez	Goalkeeper	25	Brighton & Hove Albion	26	0	0	0	1	32.0	NaN
207	Brennan Johnson	Attacking Midfield	21	Nottingham Forest	45	12	3	6	14	30.0	NaN
208	Eberechi Eze	Attacking Midfield	24	Crystal Palace	36	8	3	10	15	30.0	NaN
215	Oihan Sancet	Attacking Midfield	23	Athletic Bilbao	37	8	2	4	28	30.0	NaN
216	Gabri Veiga	Attacking Midfield	20	Celta de Vigo	36	9	4	10	22	30.0	NaN
218	Morgan Gibbs-White	Attacking Midfield	23	Nottingham Forest	38	5	7	3	9	30.0	NaN
222	Cheick Doucouré	Defensive Midfield	23	Crystal Palace	33	0	3	0	18	30.0	NaN

📥 Import Necessary Libraries¶

🌐 Data Collection and Web Scraping¶

💾 Data Preparation & Cleaning¶

Some of the countries are missing, let's fix that!¶

All missing values are for clubs in England, except two rows that belong to Spanish clubs (Athletic Bilbao and Celta de Vigo; rows 215 and 216 in the missing-country table)¶

Since we have limited data, we will focus on goals and assists. Therefore, we will analyze only attackers.¶

📊 Exploratory Data Analysis¶

We drop Club and Position as they seem to have little effect on the value¶

📈 Linear Regression¶

	name	position	age	club	matches	goals	assists	subOn	subOff	value	country	goalContributionRatio
0	Kylian Mbappé	Centre-Forward	24	Paris Saint-Germain	45	39	10	4	10	180.0	France	1.088889
1	Erling Haaland	Centre-Forward	22	Manchester City	51	57	9	2	23	170.0	England	1.294118
2	Vinicius Junior	Left Winger	22	Real Madrid	52	22	20	2	18	120.0	Spain	0.807692
4	Bukayo Saka	Right Winger	21	Arsenal FC	52	15	13	9	20	110.0	England	0.538462
5	Jamal Musiala	Attacking Midfield	20	Bayern Munich	49	15	14	13	30	110.0	Germany	0.591837